The data for this project is imported from an SQLite file. The data is the open-source Chinook dataset.
# Database connection ----
con <- DBI::dbConnect(SQLite(), "data/Chinook_Sqlite.sqlite")
# check the connection
con
## <SQLiteConnection>
## Path: C:\Users\jeff_\Documents\R ML\Example_Feature_Engineering\data\Chinook_Sqlite.sqlite
## Extensions: TRUE
dbListTables(con)
## [1] "Album" "Artist" "Customer" "Employee"
## [5] "Genre" "Invoice" "InvoiceLine" "MediaType"
## [9] "Playlist" "PlaylistTrack" "Track"
# view all tables in the database
dbListTables(con) %>% map(~ tbl(con, .))
## [[1]]
## # Source: table<`Album`> [?? x 3]
## # Database: sqlite 3.46.0 [C:\Users\jeff_\Documents\R ML\Example_Feature_Engineering\data\Chinook_Sqlite.sqlite]
## AlbumId Title ArtistId
## <int> <chr> <int>
## 1 1 For Those About To Rock We Salute You 1
## 2 2 Balls to the Wall 2
## 3 3 Restless and Wild 2
## 4 4 Let There Be Rock 1
## 5 5 Big Ones 3
## 6 6 Jagged Little Pill 4
## 7 7 Facelift 5
## 8 8 Warner 25 Anos 6
## 9 9 Plays Metallica By Four Cellos 7
## 10 10 Audioslave 8
## # ℹ more rows
##
## [[2]]
## # Source: table<`Artist`> [?? x 2]
## # Database: sqlite 3.46.0 [C:\Users\jeff_\Documents\R ML\Example_Feature_Engineering\data\Chinook_Sqlite.sqlite]
## ArtistId Name
## <int> <chr>
## 1 1 AC/DC
## 2 2 Accept
## 3 3 Aerosmith
## 4 4 Alanis Morissette
## 5 5 Alice In Chains
## 6 6 Antônio Carlos Jobim
## 7 7 Apocalyptica
## 8 8 Audioslave
## 9 9 BackBeat
## 10 10 Billy Cobham
## # ℹ more rows
##
## [[3]]
## # Source: table<`Customer`> [?? x 13]
## # Database: sqlite 3.46.0 [C:\Users\jeff_\Documents\R ML\Example_Feature_Engineering\data\Chinook_Sqlite.sqlite]
## CustomerId FirstName LastName Company Address City State Country PostalCode
## <int> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 1 Luís Gonçalves Embrae… Av. Br… São … SP Brazil 12227-000
## 2 2 Leonie Köhler <NA> Theodo… Stut… <NA> Germany 70174
## 3 3 François Tremblay <NA> 1498 r… Mont… QC Canada H2G 1A7
## 4 4 Bjørn Hansen <NA> Ullevå… Oslo <NA> Norway 0171
## 5 5 František Wichterl… JetBra… Klanov… Prag… <NA> Czech … 14700
## 6 6 Helena Holý <NA> Rilská… Prag… <NA> Czech … 14300
## 7 7 Astrid Gruber <NA> Rotent… Vien… <NA> Austria 1010
## 8 8 Daan Peeters <NA> Grétry… Brus… <NA> Belgium 1000
## 9 9 Kara Nielsen <NA> Sønder… Cope… <NA> Denmark 1720
## 10 10 Eduardo Martins Woodst… Rua Dr… São … SP Brazil 01007-010
## # ℹ more rows
## # ℹ 4 more variables: Phone <chr>, Fax <chr>, Email <chr>, SupportRepId <int>
##
## [[4]]
## # Source: table<`Employee`> [8 x 15]
## # Database: sqlite 3.46.0 [C:\Users\jeff_\Documents\R ML\Example_Feature_Engineering\data\Chinook_Sqlite.sqlite]
## EmployeeId LastName FirstName Title ReportsTo BirthDate HireDate Address City
## <int> <chr> <chr> <chr> <int> <chr> <chr> <chr> <chr>
## 1 1 Adams Andrew Gene… NA 1962-02-… 2002-08… 11120 … Edmo…
## 2 2 Edwards Nancy Sale… 1 1958-12-… 2002-05… 825 8 … Calg…
## 3 3 Peacock Jane Sale… 2 1973-08-… 2002-04… 1111 6… Calg…
## 4 4 Park Margaret Sale… 2 1947-09-… 2003-05… 683 10… Calg…
## 5 5 Johnson Steve Sale… 2 1965-03-… 2003-10… 7727B … Calg…
## 6 6 Mitchell Michael IT M… 1 1973-07-… 2003-10… 5827 B… Calg…
## 7 7 King Robert IT S… 6 1970-05-… 2004-01… 590 Co… Leth…
## 8 8 Callahan Laura IT S… 6 1968-01-… 2004-03… 923 7 … Leth…
## # ℹ 6 more variables: State <chr>, Country <chr>, PostalCode <chr>,
## # Phone <chr>, Fax <chr>, Email <chr>
##
## [[5]]
## # Source: table<`Genre`> [?? x 2]
## # Database: sqlite 3.46.0 [C:\Users\jeff_\Documents\R ML\Example_Feature_Engineering\data\Chinook_Sqlite.sqlite]
## GenreId Name
## <int> <chr>
## 1 1 Rock
## 2 2 Jazz
## 3 3 Metal
## 4 4 Alternative & Punk
## 5 5 Rock And Roll
## 6 6 Blues
## 7 7 Latin
## 8 8 Reggae
## 9 9 Pop
## 10 10 Soundtrack
## # ℹ more rows
##
## [[6]]
## # Source: table<`Invoice`> [?? x 9]
## # Database: sqlite 3.46.0 [C:\Users\jeff_\Documents\R ML\Example_Feature_Engineering\data\Chinook_Sqlite.sqlite]
## InvoiceId CustomerId InvoiceDate BillingAddress BillingCity BillingState
## <int> <int> <chr> <chr> <chr> <chr>
## 1 1 2 2009-01-01 00:0… Theodor-Heuss… Stuttgart <NA>
## 2 2 4 2009-01-02 00:0… Ullevålsveien… Oslo <NA>
## 3 3 8 2009-01-03 00:0… Grétrystraat … Brussels <NA>
## 4 4 14 2009-01-06 00:0… 8210 111 ST NW Edmonton AB
## 5 5 23 2009-01-11 00:0… 69 Salem Stre… Boston MA
## 6 6 37 2009-01-19 00:0… Berger Straße… Frankfurt <NA>
## 7 7 38 2009-02-01 00:0… Barbarossastr… Berlin <NA>
## 8 8 40 2009-02-01 00:0… 8, Rue Hanovre Paris <NA>
## 9 9 42 2009-02-02 00:0… 9, Place Loui… Bordeaux <NA>
## 10 10 46 2009-02-03 00:0… 3 Chatham Str… Dublin Dublin
## # ℹ more rows
## # ℹ 3 more variables: BillingCountry <chr>, BillingPostalCode <chr>,
## # Total <dbl>
##
## [[7]]
## # Source: table<`InvoiceLine`> [?? x 5]
## # Database: sqlite 3.46.0 [C:\Users\jeff_\Documents\R ML\Example_Feature_Engineering\data\Chinook_Sqlite.sqlite]
## InvoiceLineId InvoiceId TrackId UnitPrice Quantity
## <int> <int> <int> <dbl> <int>
## 1 1 1 2 0.99 1
## 2 2 1 4 0.99 1
## 3 3 2 6 0.99 1
## 4 4 2 8 0.99 1
## 5 5 2 10 0.99 1
## 6 6 2 12 0.99 1
## 7 7 3 16 0.99 1
## 8 8 3 20 0.99 1
## 9 9 3 24 0.99 1
## 10 10 3 28 0.99 1
## # ℹ more rows
##
## [[8]]
## # Source: table<`MediaType`> [5 x 2]
## # Database: sqlite 3.46.0 [C:\Users\jeff_\Documents\R ML\Example_Feature_Engineering\data\Chinook_Sqlite.sqlite]
## MediaTypeId Name
## <int> <chr>
## 1 1 MPEG audio file
## 2 2 Protected AAC audio file
## 3 3 Protected MPEG-4 video file
## 4 4 Purchased AAC audio file
## 5 5 AAC audio file
##
## [[9]]
## # Source: table<`Playlist`> [?? x 2]
## # Database: sqlite 3.46.0 [C:\Users\jeff_\Documents\R ML\Example_Feature_Engineering\data\Chinook_Sqlite.sqlite]
## PlaylistId Name
## <int> <chr>
## 1 1 Music
## 2 2 Movies
## 3 3 TV Shows
## 4 4 Audiobooks
## 5 5 90’s Music
## 6 6 Audiobooks
## 7 7 Movies
## 8 8 Music
## 9 9 Music Videos
## 10 10 TV Shows
## # ℹ more rows
##
## [[10]]
## # Source: table<`PlaylistTrack`> [?? x 2]
## # Database: sqlite 3.46.0 [C:\Users\jeff_\Documents\R ML\Example_Feature_Engineering\data\Chinook_Sqlite.sqlite]
## PlaylistId TrackId
## <int> <int>
## 1 1 3402
## 2 1 3389
## 3 1 3390
## 4 1 3391
## 5 1 3392
## 6 1 3393
## 7 1 3394
## 8 1 3395
## 9 1 3396
## 10 1 3397
## # ℹ more rows
##
## [[11]]
## # Source: table<`Track`> [?? x 9]
## # Database: sqlite 3.46.0 [C:\Users\jeff_\Documents\R ML\Example_Feature_Engineering\data\Chinook_Sqlite.sqlite]
## TrackId Name AlbumId MediaTypeId GenreId Composer Milliseconds Bytes
## <int> <chr> <int> <int> <int> <chr> <int> <int>
## 1 1 For Those A… 1 1 1 Angus Y… 343719 1.12e7
## 2 2 Balls to th… 2 2 1 <NA> 342562 5.51e6
## 3 3 Fast As a S… 3 2 1 F. Balt… 230619 3.99e6
## 4 4 Restless an… 3 2 1 F. Balt… 252051 4.33e6
## 5 5 Princess of… 3 2 1 Deaffy … 375418 6.29e6
## 6 6 Put The Fin… 1 1 1 Angus Y… 205662 6.71e6
## 7 7 Let's Get I… 1 1 1 Angus Y… 233926 7.64e6
## 8 8 Inject The … 1 1 1 Angus Y… 210834 6.85e6
## 9 9 Snowballed 1 1 1 Angus Y… 203102 6.60e6
## 10 10 Evil Walks 1 1 1 Angus Y… 263497 8.61e6
## # ℹ more rows
## # ℹ 1 more variable: UnitPrice <dbl>
invoices_tbl <- tbl(con, "Invoice") %>% collect()
invoices_tbl <- invoices_tbl %>%
mutate(InvoiceDate = as_date(InvoiceDate))
invoices_tbl %>% glimpse()
## Rows: 412
## Columns: 9
## $ InvoiceId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
## $ CustomerId <int> 2, 4, 8, 14, 23, 37, 38, 40, 42, 46, 52, 2, 16, 17, …
## $ InvoiceDate <date> 2009-01-01, 2009-01-02, 2009-01-03, 2009-01-06, 200…
## $ BillingAddress <chr> "Theodor-Heuss-Straße 34", "Ullevålsveien 14", "Grét…
## $ BillingCity <chr> "Stuttgart", "Oslo", "Brussels", "Edmonton", "Boston…
## $ BillingState <chr> NA, NA, NA, "AB", "MA", NA, NA, NA, NA, "Dublin", NA…
## $ BillingCountry <chr> "Germany", "Norway", "Belgium", "Canada", "USA", "Ge…
## $ BillingPostalCode <chr> "70174", "0171", "1000", "T6G 2C7", "2113", "60316",…
## $ Total <dbl> 1.98, 3.96, 5.94, 8.91, 13.86, 0.99, 1.98, 1.98, 3.9…
# save the imported data for future use
#invoices_tbl %>% write_rds("data/invoices_tbl.rds")
customers_tbl <- tbl(con, "Customer") %>% collect()
customers_tbl %>% glimpse()
## Rows: 59
## Columns: 13
## $ CustomerId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17…
## $ FirstName <chr> "Luís", "Leonie", "François", "Bjørn", "František", "Hele…
## $ LastName <chr> "Gonçalves", "Köhler", "Tremblay", "Hansen", "Wichterlová…
## $ Company <chr> "Embraer - Empresa Brasileira de Aeronáutica S.A.", NA, N…
## $ Address <chr> "Av. Brigadeiro Faria Lima, 2170", "Theodor-Heuss-Straße …
## $ City <chr> "São José dos Campos", "Stuttgart", "Montréal", "Oslo", "…
## $ State <chr> "SP", NA, "QC", NA, NA, NA, NA, NA, NA, "SP", "SP", "RJ",…
## $ Country <chr> "Brazil", "Germany", "Canada", "Norway", "Czech Republic"…
## $ PostalCode <chr> "12227-000", "70174", "H2G 1A7", "0171", "14700", "14300"…
## $ Phone <chr> "+55 (12) 3923-5555", "+49 0711 2842222", "+1 (514) 721-4…
## $ Fax <chr> "+55 (12) 3923-5566", NA, NA, NA, "+420 2 4172 5555", NA,…
## $ Email <chr> "luisg@embraer.com.br", "leonekohler@surfeu.de", "ftrembl…
## $ SupportRepId <int> 3, 5, 3, 4, 4, 5, 5, 4, 4, 4, 5, 3, 4, 5, 3, 4, 5, 3, 3, …
# save the imported data for future use
#customers_tbl %>% write_rds("data/customers_tbl.rds")
This table needs to be amended in order to be more useful for the analysis: - Genre, Album, Artist are added to the table. - The invoice and customer are also requried in this table to allow for trends to be examined by customer.
invoice_lines_tbl <- tbl(con, "InvoiceLine") %>% #needs more information to be useful - pull it in too
left_join(
tbl(con, "Track") %>%
select(-UnitPrice) %>%
rename(TrackName = Name),
by = "TrackId"
) %>%
left_join(
tbl(con, "Genre") %>% rename(GenreName = Name), by = "GenreId"
) %>%
left_join(
tbl(con, "Album") %>% rename(AlbumTitle = Title), by = "AlbumId"
) %>%
left_join(
tbl(con, "Artist") %>% rename(ArtistName = Name), by = "ArtistId"
) %>%
left_join(
tbl(con, "Invoice") %>% select(InvoiceId, CustomerId), #needed to be able to mine for trends by customer
by = "InvoiceId"
) %>%
select(-ends_with("Id"), starts_with("Invoice"), starts_with("Customer")) %>%
relocate(contains("Id"), .before = 1) %>%
collect()
invoice_lines_tbl %>% glimpse()
## Rows: 2,240
## Columns: 12
## $ InvoiceLineId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ InvoiceId <int> 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4,…
## $ CustomerId <int> 2, 2, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 14, 14, 14, 14, 14, …
## $ UnitPrice <dbl> 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.…
## $ Quantity <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ TrackName <chr> "Balls to the Wall", "Restless and Wild", "Put The Finge…
## $ Composer <chr> NA, "F. Baltes, R.A. Smith-Diesel, S. Kaufman, U. Dirksc…
## $ Milliseconds <int> 342562, 252051, 205662, 210834, 263497, 263288, 215196, …
## $ Bytes <int> 5510424, 4331779, 6713451, 6852860, 8611245, 8596840, 70…
## $ GenreName <chr> "Rock", "Rock", "Rock", "Rock", "Rock", "Rock", "Rock", …
## $ AlbumTitle <chr> "Balls to the Wall", "Restless and Wild", "For Those Abo…
## $ ArtistName <chr> "Accept", "Accept", "AC/DC", "AC/DC", "AC/DC", "AC/DC", …
# save the imported data for future use
#invoice_lines_tbl %>% write_rds("data/invoice_lines_tbl.rds")
# check the dataset
invoice_lines_tbl %>% skim()
| Name | Piped data |
| Number of rows | 2240 |
| Number of columns | 12 |
| _______________________ | |
| Column type frequency: | |
| character | 5 |
| numeric | 7 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| TrackName | 0 | 1.00 | 2 | 123 | 0 | 1888 | 0 |
| Composer | 596 | 0.73 | 2 | 188 | 0 | 572 | 0 |
| GenreName | 0 | 1.00 | 3 | 18 | 0 | 24 | 0 |
| AlbumTitle | 0 | 1.00 | 2 | 95 | 0 | 304 | 0 |
| ArtistName | 0 | 1.00 | 2 | 85 | 0 | 165 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| InvoiceLineId | 0 | 1 | 1120.50 | 646.78 | 1.00 | 560.75 | 1120.50 | 1680.25 | 2.240000e+03 | ▇▇▇▇▇ |
| InvoiceId | 0 | 1 | 206.87 | 119.13 | 1.00 | 103.00 | 207.00 | 311.00 | 4.120000e+02 | ▇▇▇▇▇ |
| CustomerId | 0 | 1 | 29.97 | 17.02 | 1.00 | 15.00 | 30.00 | 45.00 | 5.900000e+01 | ▇▇▇▇▇ |
| UnitPrice | 0 | 1 | 1.04 | 0.22 | 0.99 | 0.99 | 0.99 | 0.99 | 1.990000e+00 | ▇▁▁▁▁ |
| Quantity | 0 | 1 | 1.00 | 0.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.000000e+00 | ▁▁▇▁▁ |
| Milliseconds | 0 | 1 | 375435.99 | 507221.98 | 6373.00 | 205981.50 | 256443.50 | 321136.00 | 5.286953e+06 | ▇▁▁▁▁ |
| Bytes | 0 | 1 | 29975905.11 | 100251448.46 | 211997.00 | 6332437.25 | 8104544.00 | 10200177.25 | 1.059546e+09 | ▇▁▁▁▁ |
DBI::dbDisconnect(con)
This section attempts to create variables that might be useful for the analysis. Variable creation and reduction are both important here. There needs to be a sufficient number of useful variables that help to improve a model accuracy, without overfitting (not being useful for new data). Large numbers of similar types of variables are reduced down to mapped components or principal component variables for the analysis.
invoice_lines_tbl %>% distinct(ArtistName)
## # A tibble: 165 × 1
## ArtistName
## <chr>
## 1 Accept
## 2 AC/DC
## 3 Aerosmith
## 4 Alanis Morissette
## 5 Alice In Chains
## 6 Antônio Carlos Jobim
## 7 Apocalyptica
## 8 Audioslave
## 9 BackBeat
## 10 Billy Cobham
## # ℹ 155 more rows
## Pivot Longer (Dummy) ----
customer_artists_tbl <- invoice_lines_tbl %>% # what artists are customers buying from?
select(CustomerId, ArtistName) %>%
count(CustomerId, ArtistName) %>% # count frequency
pivot_wider( #make dummy columns by pivoting the data - for each customer, which artists?
names_from = ArtistName,
values_from = n,
values_fill = 0,
names_prefix = "artist_",
names_sep = "_"
)
customer_artists_tbl
## # A tibble: 59 × 166
## CustomerId artist_Academy of St. Martin in the Field…¹ artist_Battlestar Ga…²
## <int> <int> <int>
## 1 1 1 2
## 2 2 0 0
## 3 3 0 0
## 4 4 0 0
## 5 5 0 0
## 6 6 0 0
## 7 7 0 0
## 8 8 0 0
## 9 9 0 0
## 10 10 0 0
## # ℹ 49 more rows
## # ℹ abbreviated names:
## # ¹`artist_Academy of St. Martin in the Fields & Sir Neville Marriner`,
## # ²`artist_Battlestar Galactica (Classic)`
## # ℹ 163 more variables: `artist_Berliner Philharmoniker & Hans Rosbaud` <int>,
## # `artist_Chico Science & Nação Zumbi` <int>, `artist_Cidade Negra` <int>,
## # `artist_Cláudio Zoli` <int>, …
# save the data frame
#customer_artists_tbl %>% write_rds("data/customer_artists_tbl.rds")
The recipes library is used in the below dimensionality reduction code in order to automate the process. A seed is set in order to make it reproducable.
# Dimensionality Reduction with UMAP ----
recipe_spec_umap <- recipe(~ ., customer_artists_tbl) %>%
step_umap(
-CustomerId,
num_comp = 20, #condenses the information into 20 columns
retain = FALSE,
seed = c(123, 123),
)
customer_artists_umap_tbl <- recipe_spec_umap %>% prep() %>% juice()
customer_artists_umap_tbl
## # A tibble: 59 × 21
## CustomerId UMAP01 UMAP02 UMAP03 UMAP04 UMAP05 UMAP06 UMAP07 UMAP08
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 0.278 0.0385 0.253 0.293 -0.247 -0.302 -0.191 0.246
## 2 2 0.289 -0.434 -0.279 0.276 0.0902 -0.202 -0.124 0.0520
## 3 3 0.0272 -0.140 -0.0369 -0.124 -0.125 -0.501 0.512 -0.569
## 4 4 0.502 0.0893 0.00563 0.257 -0.635 0.165 0.0673 -0.295
## 5 5 -0.496 -0.00970 0.0147 -0.212 0.331 0.631 -0.512 0.736
## 6 6 0.171 0.0128 0.646 0.104 -0.726 0.398 -0.522 0.720
## 7 7 -0.103 0.115 0.549 0.0461 -0.360 0.411 -0.578 0.989
## 8 8 -0.327 -0.170 -0.216 -0.476 0.410 -0.161 0.421 -0.706
## 9 9 0.0108 0.329 0.535 -0.0634 -0.474 0.00439 0.365 0.452
## 10 10 0.326 -0.169 -0.133 0.346 -0.0695 0.00480 -0.0303 0.345
## # ℹ 49 more rows
## # ℹ 12 more variables: UMAP09 <dbl>, UMAP10 <dbl>, UMAP11 <dbl>, UMAP12 <dbl>,
## # UMAP13 <dbl>, UMAP14 <dbl>, UMAP15 <dbl>, UMAP16 <dbl>, UMAP17 <dbl>,
## # UMAP18 <dbl>, UMAP19 <dbl>, UMAP20 <dbl>
# save the data frame
#customer_artists_umap_tbl %>% write_rds("data/customer_artists_umap_tbl.rds")
The 2D plot below shows all customers within the context of the first 2 of the 20 dimensionality-reduced variables (UMAP variables). These can later be used to classify customers into different groups.
g <- customer_artists_umap_tbl %>%
ggplot(aes(UMAP01, UMAP02)) +
geom_point(aes(text = CustomerId), alpha = 0.5)
ggplotly(g)
This can also be plotted in 3D, with up to 4 of the UMAP variables (one can be shown as the colour)
customer_artists_umap_tbl %>%
plot_ly(x = ~ UMAP01, y = ~ UMAP02, z = ~ UMAP03, color = ~ UMAP04,
text = ~ CustomerId) %>%
add_markers()
In addition to the mapped variables, the invoice lines data frame can be investigated for information on each individual customer’s preferences.
This could include determining the preferred artist for each customer. See below the most popular artists of customer numbers 16, 35, and 55:
invoice_lines_tbl %>%
filter(CustomerId %in% c(35, 55, 16)) %>%
count(CustomerId, ArtistName) %>%
group_by(CustomerId) %>%
arrange(-n, .by_group = TRUE) %>%
slice(1:5)
## # A tibble: 15 × 3
## # Groups: CustomerId [3]
## CustomerId ArtistName n
## <int> <chr> <int>
## 1 16 Iron Maiden 14
## 2 16 Metallica 6
## 3 16 Van Halen 6
## 4 16 Gilberto Gil 4
## 5 16 Antônio Carlos Jobim 2
## 6 35 Iron Maiden 16
## 7 35 U2 9
## 8 35 Os Paralamas Do Sucesso 3
## 9 35 Ozzy Osbourne 3
## 10 35 Djavan 2
## 11 55 Iron Maiden 18
## 12 55 U2 5
## 13 55 Ozzy Osbourne 3
## 14 55 Page & Plant 3
## 15 55 Creedence Clearwater Revival 2
Or the preferred artist and genre of customer numbers 32 and 52:
invoice_lines_tbl %>%
filter(CustomerId %in% c(32, 52)) %>%
count(CustomerId, GenreName, ArtistName) %>%
group_by(CustomerId) %>%
arrange(-n, .by_group = TRUE) %>%
slice(1:5)
## # A tibble: 10 × 4
## # Groups: CustomerId [2]
## CustomerId GenreName ArtistName n
## <int> <chr> <chr> <int>
## 1 32 Latin Chico Science & Nação Zumbi 6
## 2 32 Latin Os Paralamas Do Sucesso 4
## 3 32 Metal Metallica 4
## 4 32 Reggae Cidade Negra 3
## 5 32 Rock Nirvana 3
## 6 52 Reggae Cidade Negra 5
## 7 52 Latin Legião Urbana 4
## 8 52 Latin Lulu Santos 4
## 9 52 Metal Metallica 4
## 10 52 Rock Stone Temple Pilots 3
In this section, the length of an individual song is investigated as a potential variable. The various lengths of songs are divided into 5 buckets, depending on where the song length is compared to all of the other songs. In other words, the shortest songs will be in the first, and so on until the longest songs are in the final bucket.
customer_song_len_tbl <- invoice_lines_tbl %>%
select(CustomerId, Milliseconds) %>%
group_by(CustomerId) %>%
summarise(
enframe(quantile(Milliseconds, probs = c(0, 0.25, 0.5, 0.75, 1)))
) %>%
ungroup() %>%
mutate(name = str_remove_all(name, "%")) %>%
pivot_wider(
names_from = name,
values_from = value,
names_prefix = "song_len_q"
)
customer_song_len_tbl %>%
arrange(-song_len_q100)
## # A tibble: 59 × 6
## CustomerId song_len_q0 song_len_q25 song_len_q50 song_len_q75 song_len_q100
## <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 51 133172 198504. 226572. 308426. 5286953
## 2 40 71941 192188. 241488. 304375 5088838
## 3 34 116767 202328. 260569 291905. 2960293
## 4 42 6373 211905 295601 361410. 2956081
## 5 28 116767 225305. 273762. 334804 2952702
## 6 1 71627 196342. 231960 282038 2927677
## 7 45 143725 234553. 284460 471986. 2925008
## 8 44 102164 204578. 262844. 300401. 2924716
## 9 48 112613 201273. 258991 294190. 2924007
## 10 59 131918 192456. 243042. 302909 2922088
## # ℹ 49 more rows
This section looks at creating variables to model date (i.e. seasonal) variables, and price features. This investigation uses the invoices data frame.
max_date <- max(invoices_tbl$InvoiceDate)
customer_invoice_tbl <- invoices_tbl %>%
select(CustomerId, InvoiceDate, Total) %>%
group_by(CustomerId) %>%
summarise(
#Date features
inv_most_recent_purchase = (max(InvoiceDate) - max_date) / ddays(1), #when was most recent purchase?
inv_tenure = (min(InvoiceDate) - max_date) / ddays(1), #when was first purchase?
#Price features
inv_count = n(), #how many invoices?
inv_sum = sum(Total, na.rm = TRUE), #total purchase amount?
inv_avg = mean(Total, na.rm = TRUE) #average purchase amount?
)
customer_invoice_tbl %>%
ggpairs(
columns = 2:ncol(.),
title = "Customer Aggregated Invoice Features"
)
# save the data frame
#customer_invoice_tbl %>% write_csv("data/customer_invoice_tbl.rds")
This section looks at the customer table to determine useful variables from customer data
customers_tbl %>% skim()
| Name | Piped data |
| Number of rows | 59 |
| Number of columns | 13 |
| _______________________ | |
| Column type frequency: | |
| character | 11 |
| numeric | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| FirstName | 0 | 1.00 | 3 | 9 | 0 | 57 | 0 |
| LastName | 0 | 1.00 | 4 | 12 | 0 | 59 | 0 |
| Company | 49 | 0.17 | 5 | 48 | 0 | 10 | 0 |
| Address | 0 | 1.00 | 11 | 40 | 0 | 59 | 0 |
| City | 0 | 1.00 | 4 | 19 | 0 | 53 | 0 |
| State | 29 | 0.51 | 2 | 6 | 0 | 25 | 0 |
| Country | 0 | 1.00 | 3 | 14 | 0 | 24 | 0 |
| PostalCode | 4 | 0.93 | 4 | 10 | 0 | 55 | 0 |
| Phone | 1 | 0.98 | 14 | 19 | 0 | 58 | 0 |
| Fax | 47 | 0.20 | 16 | 18 | 0 | 12 | 0 |
| 0 | 1.00 | 15 | 29 | 0 | 59 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| CustomerId | 0 | 1 | 30.00 | 17.18 | 1 | 15.5 | 30 | 44.5 | 59 | ▇▇▇▇▇ |
| SupportRepId | 0 | 1 | 3.95 | 0.82 | 3 | 3.0 | 4 | 5.0 | 5 | ▇▁▇▁▇ |
## Joining ----
customers_joined_tbl <- customers_tbl %>% # selecting elements that we want
select(contains("Id"), PostalCode, Country, City) %>%
left_join(
customer_invoice_tbl, by = "CustomerId"
) %>%
left_join(
customer_song_len_tbl, by = "CustomerId"
) %>%
left_join(
customer_artists_umap_tbl, by = "CustomerId"
) %>%
rename_at(.vars = vars(starts_with("UMAP")), .funs = ~ str_glue("artist_{.}"))
customers_joined_tbl %>% glimpse()
## Rows: 59
## Columns: 35
## $ CustomerId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14…
## $ SupportRepId <int> 3, 5, 3, 4, 4, 5, 5, 4, 4, 4, 5, 3, 4, 5, 3, …
## $ PostalCode <chr> "12227-000", "70174", "H2G 1A7", "0171", "147…
## $ Country <chr> "Brazil", "Germany", "Canada", "Norway", "Cze…
## $ City <chr> "São José dos Campos", "Stuttgart", "Montréal…
## $ inv_most_recent_purchase <dbl> -137, -527, -93, -80, -230, -39, -186, -79, -…
## $ inv_tenure <dbl> -1382, -1816, -1382, -1815, -1475, -1625, -14…
## $ inv_count <int> 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, …
## $ inv_sum <dbl> 39.62, 37.62, 39.62, 39.62, 40.62, 49.62, 42.…
## $ inv_avg <dbl> 5.660000, 5.374286, 5.660000, 5.660000, 5.802…
## $ song_len_q0 <dbl> 71627, 38164, 47333, 159216, 96914, 43232, 63…
## $ song_len_q25 <dbl> 196342.5, 208404.2, 201377.2, 233018.2, 20572…
## $ song_len_q50 <dbl> 231960.0, 233534.0, 256221.5, 272554.0, 27083…
## $ song_len_q75 <dbl> 282038.0, 277648.2, 380734.2, 360554.2, 32956…
## $ song_len_q100 <dbl> 2927677, 619467, 2610860, 2612779, 2601017, 2…
## $ artist_UMAP01 <dbl> 0.27786273, 0.28875083, 0.02717849, 0.5015313…
## $ artist_UMAP02 <dbl> 0.03845246, -0.43448150, -0.14017516, 0.08931…
## $ artist_UMAP03 <dbl> 0.2534448802, -0.2788623571, -0.0369389988, 0…
## $ artist_UMAP04 <dbl> 0.293174714, 0.276165307, -0.124304600, 0.257…
## $ artist_UMAP05 <dbl> -0.24703872, 0.09015495, -0.12503950, -0.6351…
## $ artist_UMAP06 <dbl> -0.301952958, -0.201517597, -0.500993252, 0.1…
## $ artist_UMAP07 <dbl> -0.19075727, -0.12399616, 0.51196265, 0.06727…
## $ artist_UMAP08 <dbl> 0.24619992, 0.05196908, -0.56862468, -0.29487…
## $ artist_UMAP09 <dbl> -0.185766757, 0.012162508, 0.263629705, -0.42…
## $ artist_UMAP10 <dbl> -0.23017897, -0.11081221, 0.39774135, 0.45139…
## $ artist_UMAP11 <dbl> -0.29376933, -0.62651455, 0.03085709, -0.3396…
## $ artist_UMAP12 <dbl> 0.146607399, -0.241931558, -0.015507187, -0.0…
## $ artist_UMAP13 <dbl> 0.029352857, 0.540772498, 0.147428066, 0.3670…
## $ artist_UMAP14 <dbl> -0.87143624, 0.02660123, -0.57741731, 0.66189…
## $ artist_UMAP15 <dbl> -0.18326588, -0.10868818, -0.63507593, -0.061…
## $ artist_UMAP16 <dbl> 0.146141291, -0.346388996, -0.525793552, 0.20…
## $ artist_UMAP17 <dbl> 0.31453398, 0.20617893, -0.41300654, -0.37002…
## $ artist_UMAP18 <dbl> -0.049057435, -0.361004978, -0.277115434, -0.…
## $ artist_UMAP19 <dbl> -0.131123990, -0.069565997, -0.018829366, 0.8…
## $ artist_UMAP20 <dbl> -0.16237900, 0.17466438, 0.46588811, -0.21192…
customers_joined_tbl %>% skim()
| Name | Piped data |
| Number of rows | 59 |
| Number of columns | 35 |
| _______________________ | |
| Column type frequency: | |
| character | 3 |
| numeric | 32 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| PostalCode | 4 | 0.93 | 4 | 10 | 0 | 55 | 0 |
| Country | 0 | 1.00 | 3 | 14 | 0 | 24 | 0 |
| City | 0 | 1.00 | 4 | 19 | 0 | 53 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| CustomerId | 0 | 1 | 30.00 | 17.18 | 1.00 | 15.50 | 30.00 | 44.50 | 59.00 | ▇▇▇▇▇ |
| SupportRepId | 0 | 1 | 3.95 | 0.82 | 3.00 | 3.00 | 4.00 | 5.00 | 5.00 | ▇▁▇▁▇ |
| inv_most_recent_purchase | 0 | 1 | -208.39 | 166.00 | -571.00 | -332.00 | -168.00 | -72.50 | 0.00 | ▂▂▃▅▇ |
| inv_tenure | 0 | 1 | -1612.14 | 164.30 | -1816.00 | -1752.50 | -1656.00 | -1490.50 | -1258.00 | ▇▅▂▃▂ |
| inv_count | 0 | 1 | 6.98 | 0.13 | 6.00 | 7.00 | 7.00 | 7.00 | 7.00 | ▁▁▁▁▇ |
| inv_sum | 0 | 1 | 39.47 | 2.91 | 36.64 | 37.62 | 37.62 | 39.62 | 49.62 | ▇▂▁▁▁ |
| inv_avg | 0 | 1 | 5.65 | 0.42 | 5.37 | 5.37 | 5.37 | 5.73 | 7.09 | ▇▁▁▁▁ |
| song_len_q0 | 0 | 1 | 103176.27 | 43732.33 | 6373.00 | 68610.00 | 116767.00 | 137246.50 | 166680.00 | ▃▃▅▇▆ |
| song_len_q25 | 0 | 1 | 210537.09 | 17410.59 | 165250.00 | 200146.62 | 210115.50 | 222239.38 | 244701.75 | ▁▅▇▆▃ |
| song_len_q50 | 0 | 1 | 257725.05 | 23670.65 | 199209.00 | 241762.75 | 257867.50 | 274311.25 | 312946.00 | ▁▆▇▆▂ |
| song_len_q75 | 0 | 1 | 393025.97 | 369601.04 | 249221.00 | 295953.50 | 314415.75 | 339088.25 | 2586029.75 | ▇▁▁▁▁ |
| song_len_q100 | 0 | 1 | 1633996.15 | 1247136.81 | 375418.00 | 520855.00 | 934791.00 | 2658005.50 | 5286953.00 | ▇▁▆▁▁ |
| artist_UMAP01 | 0 | 1 | 0.00 | 0.30 | -0.56 | -0.25 | -0.03 | 0.25 | 0.58 | ▅▅▆▇▃ |
| artist_UMAP02 | 0 | 1 | 0.00 | 0.28 | -0.57 | -0.17 | 0.01 | 0.14 | 0.65 | ▃▃▇▂▂ |
| artist_UMAP03 | 0 | 1 | -0.01 | 0.33 | -0.59 | -0.21 | -0.05 | 0.22 | 0.72 | ▃▇▆▃▃ |
| artist_UMAP04 | 0 | 1 | 0.00 | 0.23 | -0.48 | -0.18 | 0.00 | 0.22 | 0.45 | ▃▇▇▆▅ |
| artist_UMAP05 | 0 | 1 | 0.00 | 0.42 | -0.73 | -0.32 | -0.05 | 0.34 | 0.76 | ▆▇▆▇▆ |
| artist_UMAP06 | 0 | 1 | 0.01 | 0.33 | -0.66 | -0.28 | 0.03 | 0.25 | 0.70 | ▃▆▇▇▂ |
| artist_UMAP07 | 0 | 1 | 0.01 | 0.36 | -0.61 | -0.24 | 0.00 | 0.29 | 0.62 | ▆▆▇▆▇ |
| artist_UMAP08 | 0 | 1 | -0.03 | 0.54 | -1.01 | -0.46 | 0.06 | 0.41 | 1.05 | ▅▅▆▇▂ |
| artist_UMAP09 | 0 | 1 | -0.01 | 0.25 | -0.43 | -0.21 | -0.02 | 0.15 | 0.62 | ▆▇▇▃▂ |
| artist_UMAP10 | 0 | 1 | 0.02 | 0.28 | -0.61 | -0.22 | 0.02 | 0.27 | 0.45 | ▂▅▆▃▇ |
| artist_UMAP11 | 0 | 1 | 0.00 | 0.34 | -0.63 | -0.26 | 0.03 | 0.31 | 0.58 | ▆▇▆▇▇ |
| artist_UMAP12 | 0 | 1 | 0.02 | 0.21 | -0.35 | -0.17 | 0.01 | 0.18 | 0.46 | ▇▆▇▇▃ |
| artist_UMAP13 | 0 | 1 | 0.01 | 0.26 | -0.43 | -0.22 | 0.01 | 0.25 | 0.54 | ▇▇▆▇▅ |
| artist_UMAP14 | 0 | 1 | -0.04 | 0.54 | -1.21 | -0.46 | 0.00 | 0.31 | 1.09 | ▂▅▇▇▂ |
| artist_UMAP15 | 0 | 1 | -0.02 | 0.31 | -0.64 | -0.20 | -0.06 | 0.19 | 0.55 | ▃▅▇▆▃ |
| artist_UMAP16 | 0 | 1 | 0.01 | 0.36 | -0.76 | -0.24 | 0.10 | 0.22 | 0.82 | ▃▆▇▇▂ |
| artist_UMAP17 | 0 | 1 | -0.02 | 0.31 | -0.56 | -0.30 | 0.00 | 0.24 | 0.55 | ▇▇▇▇▅ |
| artist_UMAP18 | 0 | 1 | 0.01 | 0.26 | -0.43 | -0.19 | -0.03 | 0.15 | 0.59 | ▅▇▇▃▂ |
| artist_UMAP19 | 0 | 1 | 0.02 | 0.37 | -0.55 | -0.22 | -0.02 | 0.20 | 0.87 | ▅▇▅▃▂ |
| artist_UMAP20 | 0 | 1 | 0.01 | 0.40 | -0.72 | -0.23 | -0.03 | 0.25 | 0.83 | ▃▇▇▃▃ |
# save the data frame
#customers_joined_tbl %>% write_rds("data/customers_joined_tbl.rds")
With the different variables created, the modelling can now begin. The purpose of the model is to determine: What is the likelihood of a customer making a new purchase within 90 days?
In this case, as single model has been implemented, “XGBoost”. In a typical analysis, multiple kinds of models would be created, in a similar way to the following XGBoost modelling process. These would then be compared for accuracy and overfitting, the best models selected, and used to make a more accurate prediction. “Ensembles” can also be made from these multiple models, so that they work together to give a better prediction.
For the purpose of this project, only a single XGBoost model has been created however.
This creates the “target” variable - the likelihood that a customer will make a new purchase within 90 days.
full_data_tbl <- customers_joined_tbl %>%
mutate(Target = ifelse(inv_most_recent_purchase >= -90, 1, 0)) %>%
mutate(Target = as.factor(Target)) %>%
select(-inv_most_recent_purchase) %>%
relocate(Target, .after = CustomerId)
# save data frame
#full_data_tbl %>% write_rds("data/full_data_tbl.rds")
This is to ensure that a model can be tested on “new” data to verify that it is indeed as good as expected.
set.seed(123)
splits <- initial_split(full_data_tbl, prop = 0.80) # 80% training data, 20% for testing
# save
#write_rds(splits, "data/splits.rds")
recipe_spec_hash <- recipe(Target ~ ., training(splits)) %>%
add_role(CustomerId, new_role = "Id") %>%
step_dummy_hash(Country, City, PostalCode, num_terms = 15) #hashing compresses the wide data into fewer columns
recipe_spec_hash %>% prep() %>% juice() %>% glimpse()
## Rows: 47
## Columns: 77
## $ CustomerId <int> 31, 15, 51, 14, 3, 42, 50, 43, 37, 56, 25, 26,…
## $ SupportRepId <int> 5, 3, 5, 5, 3, 3, 5, 3, 3, 4, 5, 4, 4, 4, 5, 5…
## $ inv_tenure <dbl> -1749, -1661, -1630, -1811, -1382, -1784, -164…
## $ inv_count <int> 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7…
## $ inv_sum <dbl> 37.62, 38.62, 38.62, 37.62, 39.62, 39.62, 37.6…
## $ inv_avg <dbl> 5.374286, 5.517143, 5.517143, 5.374286, 5.6600…
## $ song_len_q0 <dbl> 34168, 114520, 133172, 142889, 47333, 6373, 65…
## $ song_len_q25 <dbl> 212897.5, 194685.0, 198503.8, 215646.8, 201377…
## $ song_len_q50 <dbl> 254942.0, 226912.0, 226572.5, 248502.0, 256221…
## $ song_len_q75 <dbl> 300891.0, 306252.5, 308426.2, 333086.8, 380734…
## $ song_len_q100 <dbl> 618031, 2611903, 5286953, 634435, 2610860, 295…
## $ artist_UMAP01 <dbl> -0.11948468, 0.17388958, 0.18722062, 0.3272512…
## $ artist_UMAP02 <dbl> 0.002805299, 0.103563838, 0.141357005, 0.08705…
## $ artist_UMAP03 <dbl> -0.1323062778, -0.0001145129, 0.1417288929, -0…
## $ artist_UMAP04 <dbl> -0.037754979, 0.260685116, 0.159957990, 0.2975…
## $ artist_UMAP05 <dbl> 0.40312889, -0.03294696, -0.04930247, -0.34193…
## $ artist_UMAP06 <dbl> -0.391777486, -0.074248962, -0.636813462, -0.0…
## $ artist_UMAP07 <dbl> 0.24607128, -0.25400597, 0.36081001, -0.090241…
## $ artist_UMAP08 <dbl> -0.22558951, 0.05578769, -0.20794505, -0.52618…
## $ artist_UMAP09 <dbl> 0.155560985, -0.219373062, 0.007724192, -0.189…
## $ artist_UMAP10 <dbl> -0.13440348, -0.25852975, 0.01007290, 0.100798…
## $ artist_UMAP11 <dbl> 0.30268568, -0.04095987, 0.07176474, -0.231664…
## $ artist_UMAP12 <dbl> 0.137900412, 0.359694660, 0.181509390, 0.46419…
## $ artist_UMAP13 <dbl> -0.223153949, 0.006784141, -0.191693127, 0.233…
## $ artist_UMAP14 <dbl> -1.016996503, -0.751374900, -1.055415154, -0.1…
## $ artist_UMAP15 <dbl> 0.17027719, 0.12868741, -0.05125653, -0.174491…
## $ artist_UMAP16 <dbl> -0.11649100, 0.52428603, -0.16132049, 0.538100…
## $ artist_UMAP17 <dbl> 0.033068459, 0.262019932, 0.008767323, -0.0524…
## $ artist_UMAP18 <dbl> -0.061739348, 0.210888311, -0.218341172, 0.156…
## $ artist_UMAP19 <dbl> -0.532718360, -0.036126595, -0.384794027, 0.38…
## $ artist_UMAP20 <dbl> 0.40559691, -0.23119453, 0.36045253, -0.112732…
## $ Target <fct> 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0…
## $ dummyhash_Country_01 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ dummyhash_Country_02 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, 0, 0…
## $ dummyhash_Country_03 <int> 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ dummyhash_Country_04 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ dummyhash_Country_05 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ dummyhash_Country_06 <int> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ dummyhash_Country_07 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ dummyhash_Country_08 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ dummyhash_Country_09 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ dummyhash_Country_10 <int> -1, -1, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ dummyhash_Country_11 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ dummyhash_Country_12 <int> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ dummyhash_Country_13 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, -1, 0, …
## $ dummyhash_Country_14 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ dummyhash_Country_15 <int> 0, 0, 0, 0, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0,…
## $ dummyhash_City_01 <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, -1, …
## $ dummyhash_City_02 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, …
## $ dummyhash_City_03 <int> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ dummyhash_City_04 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ dummyhash_City_05 <int> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ dummyhash_City_06 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -…
## $ dummyhash_City_07 <int> 0, 0, -1, 1, 0, 0, 0, 0, 1, 0, -1, 0, 0, 0, 0,…
## $ dummyhash_City_08 <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, -1, 0,…
## $ dummyhash_City_09 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -…
## $ dummyhash_City_10 <int> 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ dummyhash_City_11 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0,…
## $ dummyhash_City_12 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ dummyhash_City_13 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ dummyhash_City_14 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ dummyhash_City_15 <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ dummyhash_PostalCode_01 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ dummyhash_PostalCode_02 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ dummyhash_PostalCode_03 <int> 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0,…
## $ dummyhash_PostalCode_04 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ dummyhash_PostalCode_05 <int> 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ dummyhash_PostalCode_06 <int> 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, …
## $ dummyhash_PostalCode_07 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -…
## $ dummyhash_PostalCode_08 <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, …
## $ dummyhash_PostalCode_09 <int> 1, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ dummyhash_PostalCode_10 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ dummyhash_PostalCode_11 <int> 0, 0, 0, 0, 0, -1, 0, 0, 0, -1, -1, 0, 0, 0, 0…
## $ dummyhash_PostalCode_12 <int> 1, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ dummyhash_PostalCode_13 <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, …
## $ dummyhash_PostalCode_14 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ dummyhash_PostalCode_15 <int> 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0,…
wflw_fit_xgb_hash <- workflow() %>%
add_model(
spec = boost_tree(mode = "classification") %>% set_engine("xgboost")
) %>%
add_recipe(recipe_spec_hash) %>%
fit(training(splits))
bind_cols(
wflw_fit_xgb_hash %>% predict(testing(splits), type = "prob"),
testing(splits)
) %>%
yardstick::roc_auc(Target, .pred_1)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.704
The histogram shows the most important variables:
wflw_fit_xgb_hash$fit$fit$fit %>% vip() #gives a histogram of the most important features
We can look at different variables - of particular interest are the ones highest on the histogram for example:
full_data_tbl %>%
ggplot(aes(inv_tenure, fill = Target)) +
geom_density(alpha = 0.5)
full_data_tbl$inv_tenure %>% range()
## [1] -1816 -1258
full_data_tbl %>%
ggplot(aes(song_len_q50, fill = Target)) +
geom_density(alpha = 0.5)
full_data_tbl$song_len_q50 %>% range()
## [1] 199209 312946
full_data_tbl %>%
ggplot(aes(artist_UMAP19, fill = Target)) +
geom_density(alpha = 0.5)
full_data_tbl$artist_UMAP19 %>% range()
## [1] -0.5484314 0.8686484
bind_cols(
wflw_fit_xgb_hash %>% predict(full_data_tbl, type = "prob"),
customers_joined_tbl
) %>%
write_rds("data/customer_predictions_tbl.rds")
This analysis determined relationships between customers through the UMAP variables, combined various data tables into more useful data frames for analysis, and reduced dimensionality of the models.
The result is the behaviour of customers, along with a model of the 90 day likelihood of a customer purchasing again have been created. The full details of this can be viewed in the shiny app.